{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Settings - edit the paths below to match your environment\n",
    "\n",
    "# See \"./sample input files\" for example - solr / lucene stop word file format (one word per line, text file)\n",
    "STOP_WORDS_FILE = \"/Users/simon.hughes/Software/Solr/solr-5.1.0/server/solr/DiceJobs/conf/dice_stop_words.txt\"\n",
    "\n",
    "# Keywords generated in step 2, or manually configured from your search logs.\n",
    "# These are expected to be in the format of a solr / lucene / ES synonym file. The python code mimics the solr\n",
    "# analysis chain logic, and can be used to filter the text to just those words and phrases in the keyword files.\n",
    "# See \"./sample input files\" for examples.\n",
    "# Only one file is needed - which to use depends on what keywords / phrases are important to your domain.\n",
    "KEY_WORD_FILES = [\n",
    "    # generated in the previous step\n",
    "    \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Phrases.txt\",\n",
    "    # generated from our search log analysis\n",
    "    \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt\",\n",
    "]\n",
    "\n",
    "DOCS_FOLDER  = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/ProcessedDocs\"\n",
    "MODEL_FILE = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/keyword_model.w2v\"\n",
    "# regular expression matched against each file name in DOCS_FOLDER\n",
    "# (raw string, so that the backslash reaches the regex engine without an invalid string escape)\n",
    "FILE_MASK = r\".*\\.txt\"\n",
    "# tokenized sentences with fewer tokens than this are not used for training\n",
    "MIN_SENT_LENGTH = 5\n",
    "\n",
    "# W2Vec settings\n",
    "MIN_WD_COUNT = 10 # setting to 10 seems to remove some of the noise\n",
    "WINDOW_SIZE  = 5\n",
    "VECTOR_SIZE  = 300\n",
    "WORKERS = 8\n",
    "TRAINING_ITERATIONS = 15"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Shared\n",
    "import re\n",
    "from collections import defaultdict\n",
    "\n",
    "def load_stop_words(stop_words_file):\n",
    "    \"\"\"\n",
    "    Load a solr / lucene style stop word file (one word per line)\n",
    "\n",
    "    stop_words_file :   string\n",
    "                            path of the stop word file\n",
    "    returns         :   set of lower-cased stop words. Blank lines and\n",
    "                        lines starting with \"#\" are ignored\n",
    "    \"\"\"\n",
    "    stop_words = set()\n",
    "    with open(stop_words_file) as f:\n",
    "        for line in f:\n",
    "            word = line.strip()\n",
    "            # skip blank lines (indexing word[0] would raise an IndexError) and comments\n",
    "            if not word or word.startswith(\"#\"):\n",
    "                continue\n",
    "            stop_words.add(word.lower())\n",
    "    return stop_words\n",
    "\n",
    "# one or more whitespace characters (raw string - backslash-s is not a valid python string escape)\n",
    "re_collapse_spaces = re.compile(r\"\\s+\")\n",
    "def collapse_spaces(s):\n",
    "    \"\"\"Replace every run of whitespace in [s] with a single space.\"\"\"\n",
    "    return re_collapse_spaces.sub(\" \", s)\n",
    "\n",
    "# runs of separator punctuation and whitespace, each replaced by a single space\n",
    "# (raw string - avoids invalid python string escapes; the character class is unchanged)\n",
    "re1 = re.compile(r\"[;:'\\\"*/),(|\\s]+\")\n",
    "def clean_str(s):\n",
    "    \"\"\"\n",
    "    Normalise raw text prior to tokenization\n",
    "\n",
    "    Replaces possessive 's, hyphens, backslashes and the separator\n",
    "    characters matched by [re1] with spaces, then trims the result and\n",
    "    collapses repeated whitespace into single spaces.\n",
    "    \"\"\"\n",
    "    s = str(s).replace(\"'s\", \" \")\n",
    "    # hyphens and backslashes are handled with plain string replaces rather than in the regex\n",
    "    s = s.replace(\"-\", \" \").replace(\"\\\\\", \" \")\n",
    "    s = re1.sub(\" \", s).strip()\n",
    "    return collapse_spaces(s)\n",
    "\n",
    "def find_files(folder, regex, remove_empty = False):\n",
    "    \"\"\"\n",
    "    Find all files matching the [regex] pattern in [folder]\n",
    "\n",
    "    folder        :   string\n",
    "                          folder to search (not recursive)\n",
    "    regex         :   string (NOT regex object)\n",
    "                          pattern searched for in each entry name (case insensitive)\n",
    "    remove_empty  :   bool\n",
    "                          when True, entries with a size of 0 bytes are dropped\n",
    "    returns       :   sorted list of absolute paths\n",
    "    \"\"\"\n",
    "    # imported locally so that this function does not rely on a later cell having imported os\n",
    "    import os\n",
    "\n",
    "    pattern = re.compile(regex, re.IGNORECASE)\n",
    "    matches = [os.path.abspath(os.path.join(folder, f))\n",
    "               for f in os.listdir(folder)\n",
    "               if pattern.search(f)]\n",
    "\n",
    "    if remove_empty:\n",
    "        matches = [f for f in matches if os.path.getsize(f) > 0]\n",
    "    matches.sort()\n",
    "    return matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "class SynonymMapper(object):\n",
    "    \"\"\"\n",
    "    Replaces token sequences with the terms they are mapped to, preferring\n",
    "    the longest phrase that matches at each position.\n",
    "\n",
    "    mapper          :   dict of phrase (tokens joined by single spaces) => set of replacement terms\n",
    "    nested          :   nested dict keyed token by token, used to walk candidate phrases\n",
    "    case_sensitive  :   bool\n",
    "                            when False, tokens are lower-cased before they are looked up\n",
    "    \"\"\"\n",
    "    def __init__(self, mapper, nested, case_sensitive=False):\n",
    "        self.case_sensitive = case_sensitive\n",
    "        self.mapper = mapper\n",
    "        self.nested = nested\n",
    "        # every term that appears on the right hand side of a mapping\n",
    "        self.synonyms = set()\n",
    "        for rhs in self.mapper.values():\n",
    "            self.synonyms.update(rhs)\n",
    "\n",
    "    def is_synonym(self, term):\n",
    "        \"\"\"True if [term] is one of the replacement (right hand side) terms.\"\"\"\n",
    "        return term in self.synonyms\n",
    "\n",
    "    def map_synonyms(self, tokens, debug=False):\n",
    "        \"\"\"\n",
    "        Map the phrases found in [tokens] to their replacement terms\n",
    "\n",
    "        tokens  :   iterable of strings\n",
    "        debug   :   bool\n",
    "                        when True, the matching progress is printed\n",
    "        returns :   list of strings - unmatched tokens are kept with their original\n",
    "                    casing, matched phrases are replaced by their sorted replacement terms\n",
    "        \"\"\"\n",
    "        # materialise the input so that iterators (python 3 map / filter objects) can be indexed\n",
    "        tokens = list(tokens)\n",
    "        if self.case_sensitive:\n",
    "            tmp_tokens = tokens\n",
    "        else:\n",
    "            # a list rather than a lazy map object, as the tokens are indexed below\n",
    "            tmp_tokens = [s.lower() for s in tokens]\n",
    "        mapped = []\n",
    "        size = len(tokens)\n",
    "        ix = 0\n",
    "        while ix < size:\n",
    "            if debug:\n",
    "                print(\"ix %s\" % ix)\n",
    "            best = None\n",
    "            tmp_ix = ix\n",
    "            max_ix = ix\n",
    "            current = \"\"\n",
    "            d = self.nested\n",
    "            # walk the nested dict for as long as the next token continues a known phrase\n",
    "            while tmp_ix < size and tmp_tokens[tmp_ix] in d:\n",
    "                current += tmp_tokens[tmp_ix] + \" \"\n",
    "                key = current.strip()\n",
    "                if key in self.mapper:\n",
    "                    if debug:\n",
    "                        label = \"best:\" if best is None else \"new best:\"\n",
    "                        print(\"%s %s %s %s => %s\" % (ix, tmp_ix, label, key, self.mapper[key]))\n",
    "                    # longest match found so far\n",
    "                    best = self.mapper[key]\n",
    "                    max_ix = tmp_ix\n",
    "                d = d[tmp_tokens[tmp_ix]]\n",
    "                tmp_ix += 1\n",
    "            if not best:\n",
    "                #retain original casing\n",
    "                mapped.append(tokens[ix])\n",
    "            else:\n",
    "                # skip over the tokens consumed by the matched phrase\n",
    "                ix = max_ix\n",
    "                # best is a set - sorted to give a deterministic order\n",
    "                mapped.extend(sorted(best))\n",
    "            ix += 1\n",
    "        return mapped\n",
    "\n",
    "    def __repr__(self):\n",
    "        return \"Synonym Mapper: %i synonyms mapped\" % len(self.mapper)\n",
    "\n",
    "def build_synonym_filter(files, case_sensitive=False):\n",
    "    \"\"\"\n",
    "    Build a SynonymMapper from one or more solr / lucene style synonym files\n",
    "\n",
    "    Each non-blank line that does not start with \"#\" is either\n",
    "        a,b => x,y    explicit mapping: a and b are each mapped to {x, y}\n",
    "        a,b,c         equivalent terms: each term is mapped to {a, b, c}\n",
    "    Whitespace around the individual terms is ignored.\n",
    "\n",
    "    files           :   string or list of strings\n",
    "                            path(s) of the synonym file(s)\n",
    "    case_sensitive  :   bool\n",
    "                            passed through to the SynonymMapper\n",
    "\n",
    "    NOTE(review): terms are stored with the casing found in the file, while\n",
    "    SynonymMapper.map_synonyms lower-cases the tokens it looks up when\n",
    "    case_sensitive is False - confirm that the keyword files are lower-cased.\n",
    "    \"\"\"\n",
    "    def split_terms(s):\n",
    "        # comma separated terms, ignoring surrounding whitespace and empty entries\n",
    "        return [t for t in (part.strip() for part in s.split(\",\")) if t]\n",
    "\n",
    "    # recursively define a defaultdict generator\n",
    "    def dd():\n",
    "        return defaultdict(dd)\n",
    "\n",
    "    mapper = defaultdict(set)\n",
    "    nested_map = defaultdict(dd)\n",
    "    if isinstance(files, str):\n",
    "        files = [files]\n",
    "    for f in files:\n",
    "        # the files are only read, so they do not need to be writable\n",
    "        with open(f, \"r\") as fin:\n",
    "            for line in fin:\n",
    "                line = line.strip()\n",
    "                if not line or line.startswith(\"#\"):\n",
    "                    continue\n",
    "                if \"=>\" in line:\n",
    "                    # split on the first arrow only\n",
    "                    left, right = line.split(\"=>\", 1)\n",
    "                    left_parts = split_terms(left)\n",
    "                    right = set(split_terms(right))\n",
    "                else:\n",
    "                    left_parts = split_terms(line)\n",
    "                    right = set(left_parts)\n",
    "\n",
    "                for syn in left_parts:\n",
    "                    tokens = syn.split()\n",
    "                    # map_synonyms builds its lookup keys by joining tokens with single spaces\n",
    "                    mapper[\" \".join(tokens)].update(right)\n",
    "                    # register the phrase in the nested dict, one token per level\n",
    "                    d = nested_map\n",
    "                    for token in tokens:\n",
    "                        d = d[token]\n",
    "    return SynonymMapper(mapper, nested_map, case_sensitive)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#String processing\n",
    "def white_space_tokenize(s):\n",
    "    \"\"\"Split [s] into a list of tokens on single space characters.\"\"\"\n",
    "    tokens = s.split(\" \")\n",
    "    return tokens\n",
    "\n",
    "__punct__ = set(\".?!,;:\")\n",
    "def remove_punct_at_end(s):\n",
    "    \"\"\"\n",
    "    Strip trailing punctuation characters (see __punct__) from [s].\n",
    "    The first character is always kept, even if it is punctuation.\n",
    "    \"\"\"\n",
    "    end = len(s)\n",
    "    while end > 1 and s[end - 1] in __punct__:\n",
    "        end -= 1\n",
    "    return s[:end]\n",
    "\n",
    "#Token Filters\n",
    "def fact_len_filter(max_len):\n",
    "    \"\"\"\n",
    "    Create a token filter that keeps the tokens whose length is at least [max_len]\n",
    "\n",
    "    NOTE: despite its name, [max_len] is the MINIMUM length a token needs to be kept.\n",
    "    \"\"\"\n",
    "    def len_filter(tokens):\n",
    "        # a list (not a lazy python 3 filter object) so that callers can take len() of the result\n",
    "        return [s for s in tokens if len(s) >= max_len]\n",
    "    return len_filter\n",
    "\n",
    "# drops empty strings\n",
    "remove_empty_tokens_filter = fact_len_filter(1)\n",
    "\n",
    "def lower_case_filter(tokens):\n",
    "    \"\"\"\n",
    "    Lower-case [tokens]: a single string is returned lower-cased,\n",
    "    any other iterable of strings is returned as a list of lower-cased strings.\n",
    "    \"\"\"\n",
    "    if isinstance(tokens, str):\n",
    "        return tokens.lower()\n",
    "    # a list rather than a lazy python 3 map object, so that the result can be indexed\n",
    "    return [t.lower() for t in tokens]\n",
    "\n",
    "def remove_punct_at_end_filter(tokens):\n",
    "    \"\"\"Apply remove_punct_at_end to every token and return the results as a list.\"\"\"\n",
    "    # a list rather than a lazy python 3 map object\n",
    "    return [remove_punct_at_end(t) for t in tokens]\n",
    "\n",
    "def fact_is_synonym_filter(syn_mapper):\n",
    "    \"\"\"\n",
    "    Create a token filter that keeps only the tokens for which\n",
    "    [syn_mapper].is_synonym returns True.\n",
    "    \"\"\"\n",
    "    def is_synonym_filter(tokens):\n",
    "        # a list (not a lazy python 3 filter object) so that callers can take len() of the result\n",
    "        return [t for t in tokens if syn_mapper.is_synonym(t)]\n",
    "    return is_synonym_filter\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def fact_stop_word_filter(case_sensitive, stop_words_file):\n",
    "    \"\"\"\n",
    "    Create a token filter that removes the stop words listed in [stop_words_file]\n",
    "\n",
    "    case_sensitive   :   bool\n",
    "                             when False, stop words and tokens are compared lower-cased\n",
    "    stop_words_file  :   string\n",
    "                             path of a stop word file (one word per line). Blank lines\n",
    "                             and lines starting with \"#\" are ignored\n",
    "    returns          :   function taking a list of tokens and returning the list of\n",
    "                         tokens that are not stop words\n",
    "    \"\"\"\n",
    "    stop_words = set()\n",
    "    with open(stop_words_file) as f:\n",
    "        for line in f:\n",
    "            word = line.strip()\n",
    "            # skip blank lines (indexing word[0] would raise an IndexError) and comments\n",
    "            if not word or word.startswith(\"#\"):\n",
    "                continue\n",
    "            if not case_sensitive:\n",
    "                word = word.lower()\n",
    "            stop_words.add(word)\n",
    "\n",
    "    def cs_stop_filter(tokens):\n",
    "        return [tok for tok in tokens if tok not in stop_words]\n",
    "\n",
    "    def stop_filter(tokens):\n",
    "        return [tok for tok in tokens if tok.lower() not in stop_words]\n",
    "\n",
    "    return cs_stop_filter if case_sensitive else stop_filter\n",
    "\n",
    "stop_filter = fact_stop_word_filter(False, STOP_WORDS_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def analyze(s, filters):\n",
    "    \"\"\"Pass [s] through each function in [filters], in order, and return the final result.\"\"\"\n",
    "    result = s\n",
    "    for apply_filter in filters:\n",
    "        result = apply_filter(result)\n",
    "    return result\n",
    "\n",
    "def debug_analyze(s, filters):\n",
    "    \"\"\"\n",
    "    Same as analyze, but prints the input and the output of every filter,\n",
    "    one per line, prefixed with the (padded) name of the filter.\n",
    "    List results are printed with their items joined by \"|\".\n",
    "    \"\"\"\n",
    "    temp = s\n",
    "    pad = 20\n",
    "    print(\"%s %s\" % (\"START\".ljust(pad), temp))\n",
    "    for f in filters:\n",
    "        temp = f(temp)\n",
    "        if isinstance(temp, list):\n",
    "            s_temp = \"|\".join(map(str, temp))\n",
    "        else:\n",
    "            s_temp = str(temp)\n",
    "        # __name__ exists on python 2 and 3 (func_name is python 2 only)\n",
    "        name = getattr(f, \"__name__\", repr(f))\n",
    "        print(\"%s %s\" % (name.ljust(pad), s_temp))\n",
    "    return temp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Synonym Mapper: 20123 synonyms mapped"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "syn_mapper = build_synonym_filter(KEY_WORD_FILES, False)\n",
    "syn_mapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "START                $150k as400 Sr.\\ Java/j2ee and the C#.! developer. FIT \"HOT\" dev. -IBM's business, sql server management\n",
      "clean_str            $150k as400 Sr. Java j2ee and the C#.! developer. FIT HOT dev. IBM business sql server management\n",
      "white_space_tokenize $150k|as400|Sr.|Java|j2ee|and|the|C#.!|developer.|FIT|HOT|dev.|IBM|business|sql|server|management\n",
      "remove_punct_at_end_filter $150k|as400|Sr|Java|j2ee|and|the|C#|developer|FIT|HOT|dev|IBM|business|sql|server|management\n",
      "lower_case_filter    $150k|as400|sr|java|j2ee|and|the|c#|developer|fit|hot|dev|ibm|business|sql|server|management\n",
      "stop_filter          $150k|as400|sr|java|j2ee|c#|developer|fit|dev|ibm|business|sql|server|management\n",
      "map_synonyms         $150k|as400|sr java|j2ee|c# developer|fit|dev|ibm|business|sql server management\n",
      "len_filter           $150k|as400|sr java|j2ee|c# developer|fit|dev|ibm|business|sql server management\n"
     ]
    }
   ],
   "source": [
    "#Skills from text\n",
    "is_a_synonym_filter = fact_is_synonym_filter(syn_mapper)\n",
    "\n",
    "# The filters are applied in order.\n",
    "# To train on keywords only, append is_a_synonym_filter to the end of the chain. It is best to\n",
    "# train on all words, and then filter the learned synonyms to keywords.\n",
    "analysis_chain = [clean_str,\n",
    "                  white_space_tokenize,\n",
    "                  remove_punct_at_end_filter,\n",
    "                  lower_case_filter,\n",
    "                  stop_filter,\n",
    "                  syn_mapper.map_synonyms, \n",
    "                  remove_empty_tokens_filter]\n",
    "\n",
    "#Test\n",
    "# (the backslash in the sample text is escaped - backslash-space is not a valid python string escape)\n",
    "rslt = debug_analyze(\"$150k as400 Sr.\\\\ Java/j2ee and the C#.! developer. FIT \\\"HOT\\\" dev. -IBM's business, sql server management\", \n",
    "                     analysis_chain)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "66989 files found in /Users/simon.hughes/Documents/Dice Data/LuceneTalk/ProcessedDocs\n",
      "Loading 2376615 sentences took 15.9088010788 seconds\n"
     ]
    }
   ],
   "source": [
    "import os, re, time\n",
    "start = time.time()\n",
    "\n",
    "# empty files are skipped\n",
    "files = find_files(DOCS_FOLDER, FILE_MASK, True)\n",
    "print(\"%s files found in %s\" % (len(files), DOCS_FOLDER))\n",
    "\n",
    "# every line of every file is treated as one sentence\n",
    "sentences = []\n",
    "for fname in files:\n",
    "    with open(fname) as f:\n",
    "        sentences.extend(f.read().split(\"\\n\"))\n",
    "end = time.time()\n",
    "print(\"Loading %i sentences took %s seconds\" % (len(sentences), str(end - start)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2376615\n",
      "Tokenizing sentences\n",
      "0\n",
      "100000\n",
      "200000\n",
      "300000\n",
      "400000\n",
      "500000\n",
      "600000\n",
      "700000\n",
      "800000\n",
      "900000\n",
      "1000000\n",
      "1100000\n",
      "1200000\n",
      "1300000\n",
      "1400000\n",
      "1500000\n",
      "1600000\n",
      "1700000\n",
      "1800000\n",
      "1900000\n",
      "2000000\n",
      "2100000\n",
      "2200000\n",
      "2300000\n"
     ]
    }
   ],
   "source": [
    "print(len(sentences))\n",
    "print(\"Tokenizing sentences\")\n",
    "# sentences with fewer than MIN_SENT_LENGTH tokens are dropped\n",
    "tokenized = []\n",
    "for i, sent in enumerate(sentences):\n",
    "    tokens = analyze(sent, analysis_chain)\n",
    "    if len(tokens) >= MIN_SENT_LENGTH:\n",
    "        tokenized.append(tokens)\n",
    "    # progress indicator\n",
    "    if i % 100000 == 0:\n",
    "        print(i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim, time\n",
    "from gensim.models.word2vec import Word2Vec\n",
    "\n",
    "start = time.time()\n",
    "\n",
    "print(\"Training Model. This could take a while (10-60 mins for moderate collections). Get a coffee\")\n",
    "# NOTE(review): the iter / size keyword names are those of older gensim releases (they were renamed\n",
    "# to epochs / vector_size in gensim 4.0) - confirm against the installed gensim version\n",
    "model = Word2Vec(tokenized, iter=TRAINING_ITERATIONS, size=VECTOR_SIZE, window=WINDOW_SIZE, min_count=MIN_WD_COUNT, workers=WORKERS, sample=1e-5, hs=0, negative=20)\n",
    "model.save(MODEL_FILE)\n",
    "end = time.time()\n",
    "print(\"Took %s seconds\" % (end - start))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#find the top n similar terms as below:\n",
    "#model.most_similar(positive=\"hadoop developer\",topn=10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:work]",
   "language": "python",
   "name": "conda-env-work-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
